In [1]:
#@title Default title text
! git clone https://github.com/tayuny/Data_Viz_proj.git
Cloning into 'Data_Viz_proj'...
remote: Enumerating objects: 12, done.
remote: Counting objects: 100% (12/12), done.
remote: Compressing objects: 100% (10/10), done.
remote: Total 12 (delta 4), reused 0 (delta 0), pack-reused 0
Unpacking objects: 100% (12/12), done.
In [2]:
#@title Default title text
! pip install geopandas
import altair as alt
import pandas as pd
import numpy as np
import geopandas as geop
from shapely.geometry import Point
import re
import json
import warnings
from io import BytesIO
import requests
warnings.filterwarnings('ignore')
Collecting geopandas
  Downloading https://files.pythonhosted.org/packages/52/4f/6440a63c9367d981a91de458467eed4a8e259a26f24158071b610a1ed1dd/geopandas-0.6.3-py2.py3-none-any.whl (920kB)
     |████████████████████████████████| 921kB 2.8MB/s 
Collecting pyproj
  Downloading https://files.pythonhosted.org/packages/d6/70/eedc98cd52b86de24a1589c762612a98bea26cde649ffdd60c1db396cce8/pyproj-2.4.2.post1-cp36-cp36m-manylinux2010_x86_64.whl (10.1MB)
     |████████████████████████████████| 10.1MB 41.5MB/s 
Requirement already satisfied: pandas>=0.23.0 in /usr/local/lib/python3.6/dist-packages (from geopandas) (0.25.3)
Collecting fiona
  Downloading https://files.pythonhosted.org/packages/50/f7/9899f8a9a2e38601472fe1079ce5088f58833221c8b8507d8b5eafd5404a/Fiona-1.8.13-cp36-cp36m-manylinux1_x86_64.whl (11.8MB)
     |████████████████████████████████| 11.8MB 26.0MB/s 
Requirement already satisfied: shapely in /usr/local/lib/python3.6/dist-packages (from geopandas) (1.7.0)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas) (2018.9)
Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas) (1.17.5)
Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas) (2.6.1)
Collecting cligj>=0.5
  Downloading https://files.pythonhosted.org/packages/e4/be/30a58b4b0733850280d01f8bd132591b4668ed5c7046761098d665ac2174/cligj-0.5.0-py3-none-any.whl
Collecting munch
  Downloading https://files.pythonhosted.org/packages/cc/ab/85d8da5c9a45e072301beb37ad7f833cd344e04c817d97e0cc75681d248f/munch-2.5.0-py2.py3-none-any.whl
Requirement already satisfied: click<8,>=4.0 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (7.0)
Requirement already satisfied: attrs>=17 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (19.3.0)
Collecting click-plugins>=1.0
  Downloading https://files.pythonhosted.org/packages/e9/da/824b92d9942f4e472702488857914bdd50f73021efea15b4cad9aca8ecef/click_plugins-1.1.1-py2.py3-none-any.whl
Requirement already satisfied: six>=1.7 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (1.12.0)
Installing collected packages: pyproj, cligj, munch, click-plugins, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.5.0 fiona-1.8.13 geopandas-0.6.3 munch-2.5.0 pyproj-2.4.2.post1
In [3]:
#@title Default title text
c_color_theme = ["#9cceff", "#2794ff", "#138bff", "#0063c4", "#0059b0", "#004f9c", "#004589"]
d_color_theme2 = ["#634c53", "#4b87ff", "#f2e522", "#cdd7d9", "#b0b0ff", "#b0d8ff", "#f27d22", "#ffcb64"]

def general_theme(*args, **kwargs):
    return {'width': 400, 'height': 500,
            'config': {"range": {"category": d_color_theme2, "diverging": c_color_theme},
                       'legend': {'titleFontSize': 12, 'labelFontSize': 12, "strokeColor":'gray',
                                  "fillColor":'#fefdee', "padding":12}, 
                       'axis': {'titleFontSize': 12, 'labelFontSize': 12},
                       "title":{"font": "Lato", "fontSize": 16, "color":"black"},
                       "header":{"titleFontSize":12}}}

alt.themes.register('general_theme', general_theme)
alt.themes.enable('general_theme')
Out[3]:
ThemeRegistry.enable('general_theme')
In [0]:
#@title Default title text
full_data = pd.read_csv("Data_Viz_proj/data/full_data50.csv")
selected_columns = ['AIN', 'ZIPcode5', 'RollYear', 'CT','GeneralUseType', 'TotalValue', 'EffectiveYearBuilt', 'SQFTmain', 'Bedrooms', 'Bathrooms',
                    'unemployment', 'pop', 'gini', 'median_income','poverty_rate', 'pop75', 'Travel_Time701902', 'Travel_Time267800', 'TotalValue2017',
                    'TotalValue2016', 'TotalValue2015', 'TotalValue2014']
main_data = full_data[selected_columns]
main_data = main_data[main_data["median_income"] != "-"]
main_data["median_income"] = main_data["median_income"].astype(float)
plot_data = main_data.groupby("CT").median()
In [0]:
#@title Default title text
plot_data["med_inc_b"] = 0
over_all_mid_income = plot_data.groupby("CT").mean()["median_income"].median()
plot_data.loc[plot_data["median_income"] >= over_all_mid_income, "med_inc_b"] = 1
plot_data["CT"] = plot_data.index
plot_data["building_age"] = 2018 - plot_data["EffectiveYearBuilt"]

The main purpose of the project is to evaluate the effect from the traveling time to commercial districts to housing value. First of all, a clear definition of commercial district is necessary for this purpose.

In [6]:
#@title Default title text
nearest10 = list(main_data.groupby("CT").mean()["Travel_Time701902"].sort_values(ascending=True)[:10].index.astype(str))
usage_CT_data = ((main_data[(main_data["GeneralUseType"] != "(missing)") & (main_data["GeneralUseType"] != "Dry Farm")]
                  ).groupby(["CT", "GeneralUseType"]).size() / \
                 main_data.groupby(["CT"]).size()
                 ).to_frame(name="use_ratio").reset_index()
usage_CT_data["CT"] = usage_CT_data["CT"].astype(str)
neighbor_dist = list(usage_CT_data[usage_CT_data["GeneralUseType"] == "Commercial"].sort_values(
              by="use_ratio", ascending=False)["CT"][:20])

alt.Chart(usage_CT_data[usage_CT_data["CT"].isin(neighbor_dist)]).mark_bar().encode(
    x=alt.X('sum(use_ratio)', stack="normalize", axis=alt.Axis(title="Usage Ratio for specific buildings")),
    y=alt.Y("CT", axis=alt.Axis(title="Census Tract ID")),
    color=alt.Color("GeneralUseType", legend=alt.Legend(title=["Building", "Usage Type"]))
).properties(
    width = 500, 
    height = 400,
    title = {"text" : "Percentage of Building Usage Type in Each Neighborhood",
             "subtitle" : ["20 Neighborhoods (with census tract number) which has highest ratio for commercial properties", 
             "Region 701902 is selected to represent the Commercial District"]}
)
Out[6]:
  • Data Source: Los Angeles County Parcels Data

In the above graph, we can discover the ratio of the general usage type of all the properties in given regions. We discover that cencus tract 701902 has both high number and proportion of commercial properties. Thus, I define census tract 701902 as commercial district in the following research. Given the fact that region 702300 which contains the most commercial properties are close to tract 701902 (less than 5 min), the commercial districts is concentrated in this case.

In [0]:
#@title Default title text
selected_columns = ['TotalValue', 'SQFTmain', 'Bedrooms', 'Bathrooms', 'unemployment', 'pop', 'gini',
                    'median_income', 'poverty_rate', 'pop75', 'Travel_Time701902', 'Travel_Time267800', 
                    'CT', 'building_age']
geop_file = geop.read_file("Data_Viz_proj/data/full_sub_CL701902.geojson")
geop_file.loc[:, 'lon'] = geop_file.geometry.centroid.x
geop_file.loc[:, 'lat'] = geop_file.geometry.centroid.y
geop_sub = geop_file[geop_file["CT"] == 701902]
In [0]:
#@title Default title text
  def plot_geo_from_file(center_CT, select_col, col_type, title_dict, legend_title, tooltip_col, center_color="black"):
    '''
    The function take the center census and select columns and
    provide geographical visualization for them
    Inputs:
        center_CT: commercial census tract
        select_cols: columns for visualization
        col_type: the type of the selected columns
    '''
    new_selected = "properties." + select_col + ":Q"
    save_dir = "Data_Viz_proj/data/full_sub_CL" + center_CT + ".geojson"

    with open(save_dir) as f:
        LA_shape_g = json.load(f)
    
    data_geojson = alt.InlineData(values=LA_shape_g, format=alt.DataFormat(property='features',type='json'))

    chart = alt.Chart(data_geojson).mark_geoshape(stroke="black").encode(
                        color=alt.Color(new_selected, type='quantitative', scale=alt.Scale(scheme="yellowgreenblue"), title=legend_title),
                        tooltip=tooltip_col
                        ).properties(
                            width=500,
                            height=300,
                      title=title_dict
    )
    
    label = alt.Chart(geop_file[geop_file["CT"] == 701902]).mark_text(color=center_color, text=["Commercial", "District"]).encode(
                                             latitude="lat",
                                             longitude="lon",
                                             )

    return chart + label
In [11]:
tooltip_col = ["properties.CT:N", "properties.SQFTmain:Q", "properties.EffectiveYearBuilt:Q"]
title_dict = {"text": ["The Location of Commercial District in Santa Monica and the Corresponding Traveling Time to it"],
              "subtitle": ["Commercial District of Santa Monica is Located along the coast, traveling time is shorter along the coast"]}
plot_geo_from_file("701902", "Travel_Time701902", "Q", title_dict , ["Traveling Time", "To Commercial", "District"], tooltip_col) 
Out[11]:
  • Data Source: Los Angeles County Parcels Data, Uber Movement Data
In [0]:
#@title Default title text
com_per = pd.DataFrame(np.percentile(main_data[main_data["GeneralUseType"] == "Commercial"]["TotalValue"], np.arange(1,99,1)), columns=["Property_Value"])
com_per["Use_Type"] = "Commercial"
com_per["percentile"] = com_per.index
ind_per = pd.DataFrame(np.percentile(main_data[main_data["GeneralUseType"] == "Industrial"]["TotalValue"], np.arange(1,99,1)), columns=["Property_Value"])
ind_per["Use_Type"] = "Industrial"
ind_per["percentile"] = com_per.index
res_per = pd.DataFrame(np.percentile(main_data[main_data["GeneralUseType"] == "Residential"]["TotalValue"], np.arange(1,99,1)), columns=["Property_Value"])
res_per["Use_Type"] = "Residential"
res_per["percentile"] = res_per.index
total_per = pd.concat([res_per, com_per], join="inner")
total_per = pd.concat([total_per, ind_per], join="inner")
In [13]:
#@title Default title text
accu = alt.Chart(total_per).mark_area(opacity=0.3).encode(
    x=alt.X("percentile:O", title="Percentile of Each Property Type"),
    y=alt.Y("Property_Value:Q", title="Property Value of the Specific Percentile (Black Line: median property value)"),
    color=alt.Color("Use_Type:N", title="Property Type"),
    opacity=alt.value(0.8)
).properties(
    width = 600, 
    height = 400,
    title={"text": "Property Value of Each Percentile with Different Building Type",
           "subtitle": ["The property value increase exponentially at the higher percentile, the distribution is unbalance",
                        "Buildings with higher profitability (Commercial and Industrial) have higher value and skewedness"]}
)

global_median = alt.Chart(total_per).mark_rule(color='black').encode(
    y = alt.Y('median(Property_Value):Q'),
)

accu + global_median
Out[13]:
  • Data Source: Los Angeles County Parcels Data
In [0]:
#@title Default title text
MT_data = pd.DataFrame(columns=full_data.columns)
for c in pd.unique(full_data["CT"]):
    sub_data = full_data[full_data.CT == c].reset_index(drop=True)
    sub_size = sub_data.shape[0]
    sel_idx = np.random.permutation(sub_size)[:int((sub_size * 0.05) // 1)]
    MT_data = pd.concat([MT_data, sub_data.iloc[sel_idx]])
    
allCT = pd.unique(full_data["CT"])
geopf = geop.read_file("Data_Viz_proj/data/full_sub_CL701902.geojson")
sub_full = geopf[geopf["CT"].isin(allCT)]
sub_full.to_file("Data_Viz_proj/data/full_sub_CL701902_part.geojson", driver='GeoJSON')
In [15]:
#@title Default title text
save_dir = "Data_Viz_proj/data/full_sub_CL701902_part.geojson"
with open(save_dir) as f:
    LA_shape_g = json.load(f)

data_geojson = alt.InlineData(values=LA_shape_g, format=alt.DataFormat(property='features',type='json'))

background = alt.Chart(data_geojson).mark_geoshape(color="#f6f3f4").encode().properties(
                            width=600,
                            height=400
                        )

chart = alt.Chart(MT_data[MT_data.GeneralUseType != "(missing)"]
    ).mark_circle(size=25).encode(
        longitude='CENTER_LON:Q',
        latitude='CENTER_LAT:Q',
        color=alt.Color("GeneralUseType", title="Building Type"),
        opacity = alt.value(0.8)
    ).properties(
        width=600,
        height=400,
        title={"text": "The Sampled Locations for Properties in Different Usage Type",
               "subtitle": ["residential properties locate in the specific blocks and", 
                            "commercial properties tend to locate along the main roads and commercial district"]}
    )
background + chart
Out[15]:
  • Data Source: Los Angeles County Parcels Data
In [16]:
#@title Default title text
chart_SQFTmain = alt.Chart(plot_data).mark_rect().encode(
    x=alt.X('unemployment:Q', axis=alt.Axis(title="Unemployment Rate in the Neighborhood"), bin=alt.Bin(maxbins=20) ),
    y=alt.Y('median_income:Q', axis=alt.Axis(title="Median Income in the Neighborhood"), bin=alt.Bin(maxbins=20)),
    color=alt.Color('TotalValue:Q', title="Property Value")
)

circ = chart_SQFTmain.mark_point().encode(
    alt.ColorValue('grey'),
    alt.Size('pop',
        legend=alt.Legend(title=['Population in ', "the Region"])
    ),
    tooltip=alt.Tooltip('CT:N', title='Neighborhood')
)

(chart_SQFTmain + circ).properties(
    width = 500, 
    height = 400,
    title={"text": "Relation Between Median Income, Unemployment Rate, Population and Property Value",
           "subtitle": ["In general, neighborhood with higher median income tend to have higher median property value", 
                        "the main commercial district (blue cell at the right bottom) is surprisingly characterized with",
                        "high property values, low median income and high unemployment rate"]}
)
Out[16]:
  • Data Source: Los Angeles County Parcels Data, Uber Movement Data, American
In [17]:
#@title Default title text
tooltip_col = ["properties.CT:N", "properties.SQFTmain:Q", "properties.EffectiveYearBuilt:T"]
title_dict = {"text": ["The Distribution of Median Property Value Among Neighborhoods in Santa Monica"],
              "subtitle": ["the median property value in the commercial district is high because of its profitability", 
                           ", where most of the industrial and commercial properties located"]}
plot_geo_from_file("701902", "TotalValue", "Q", title_dict , ["Median Property", "Value in the", "Region"], tooltip_col, "white")
Out[17]:
  • Data Source: Los Angeles County Parcels Data, Uber Movement Data, American Community Survey
In [18]:
#@title Default title text
tooltip_col = ["properties.CT:N", "properties.poverty_rate:N", "properties.pop:Q"]
title_dict = {"text": ["The Distribution of Median Income Among Neighborhoods in Santa Monica"],
              "subtitle": ["the median income of the neighborhoods are low in the center of the city (commercial district)", 
                           "and it is relatively high in the sub-urban area"]}
plot_geo_from_file("701902", "median_income", "Q", title_dict , ["Median Income", "in the", "Region"], tooltip_col)
Out[18]:
  • Data Source: Los Angeles County Parcels Data, Uber Movement Data, American Community Survey
In [0]:
#@title Default title text
sqft_perc = np.percentile(main_data["SQFTmain"], np.arange(0,100,1))
main_data["SQFT_group"] = 0
for i, q in enumerate(sqft_perc[:-1]):
  main_data.loc[(main_data["SQFTmain"] > sqft_perc[i + 1]) & (main_data["GeneralUseType"] == "Residential"), "SQFT_group"] = i + 1
  rep_main = main_data.groupby("SQFT_group").median().reset_index()
In [20]:
#@title Default title text
rel1 = alt.Chart(rep_main[rep_main["SQFT_group"] <= 95]).mark_circle().encode(x=alt.X('SQFTmain:Q', axis=alt.Axis(title="Square Footage of a Building")), 
                                         y=alt.Y('TotalValue:Q', axis=alt.Axis(title="Approximated Property Value (Black line: median)")), 
                                         color=alt.Color("Travel_Time701902:Q", title= ["Traveling Time", "to Commercial","District"]),
                                         size=alt.Size("TotalValue", title="Property Value")
).properties(
    width = 600, 
    height = 400,
    title = {"text": ["Relation between Housing Value, Housing Square Footage, and Traveling Time to Commercial Districts",
             "for Residential Properties"],
             "subtitle": ["Linear relation between building square footage and the property value is clear until a certain level, ", 
                          "Buildings with higher square footage and higher value tend to have higher traveling time to the city center"]}
)

global_median = alt.Chart(rep_main).mark_rule(color='black').encode(
    y=alt.Y('median(TotalValue):Q'),
)

rel1 + global_median
Out[20]:

Note: House Value labelled in the graph is approximated by the median house value of given neighborhood

  • Source: Los Angeles County Parcels Data, Uber Movement Data, American Community Survey
In [0]:
#@title Default title text
sqft_perc = np.percentile(main_data["SQFTmain"], np.arange(0,100,1))
main_data["SQFT_group"] = 0
for i, q in enumerate(sqft_perc[:-1]):
  main_data.loc[(main_data["SQFTmain"] > sqft_perc[i + 1]) & (main_data["GeneralUseType"] == "Commercial"), "SQFT_group"] = i + 1
  rep_main = main_data.groupby("SQFT_group").median().reset_index()
In [22]:
#@title Default title text
rel2 = alt.Chart(rep_main[rep_main["SQFT_group"] <= 95]).mark_circle().encode(x=alt.X('SQFTmain:Q', axis=alt.Axis(title="Square Footage of a Building")), 
                                         y=alt.Y('TotalValue:Q', axis=alt.Axis(title="Total Value of a Building (Black line: median)")), 
                                         color=alt.Color("Travel_Time701902:Q", title= ["Traveling Time", "to Commercial","District"]),
                                         size=alt.Size("TotalValue", title="Property Value")
).properties(
    width = 600, 
    height = 400,
    title = {"text": ["Relation between Housing Value, Housing Square Footage, and Traveling Time to Residential Districts",
             "for Commercial Properties"],
             "subtitle": ["The linear relation between these characteristics for the residential properties is not observed in",
                          "the case of commercial properties"]}
)

global_median = alt.Chart(rep_main).mark_rule(color='black').encode(
    y=alt.Y('median(TotalValue):Q'),
)

rel2 + global_median
Out[22]:

Note: House Value labelled in the graph is approximated by the median house value of given Neighborhood

  • Source: Los Angeles County Parcels Data, Uber Movement Data, American Community Survey
In [0]:
#@title Default title text
def create_yearly_table(grouped_data, selected_columns, year_list,
                        group_col="CT", group_thres=np.arange(0,1,0.2)):
    '''
    The function take the summarized table with the selected columns and year list
    and provide information of the time trend for different sub-group specified in
    group_col parameter
    Inputs:
        grouped_data: summarized table for census tracts
        selected_columns: columns to be kept in the final table
        year_list: The tax year for House Value included
        group_col: columns for creating sub-groups
        group_thres: percentile taken to group the group_col
    Return: summarized table grouped by group_col and year
    '''
    final_df = grouped_data[selected_columns]
    final_df["year"] = 2018
    if group_col != "CT":
        for i, thres in enumerate(group_thres):
            final_df.loc[final_df[group_col] >= final_df[group_col].quantile(
                         thres), group_col + "_b"] = i * 20
    
    for year in year_list:
        temp = grouped_data
        valcol = "TotalValue" + str(year)
        temp["TotalValue"] = temp[valcol]
        temp = temp[selected_columns]
        temp["year"] = year
        if group_col != "CT":
            for i, thres in enumerate(group_thres):
                temp.loc[temp[group_col] >= temp[group_col].quantile(
                         thres), group_col + "_b"] = i * 20
            
        final_df = pd.concat([final_df, temp], join="inner")
    
    return final_df
In [0]:
#@title Default title text
plot_data["building_age"] = 2018 - plot_data["EffectiveYearBuilt"]
selected_columns = ['AIN', 'ZIPcode5', 'RollYear', 'TotalValue', 'EffectiveYearBuilt',
                     'SQFTmain', 'Bedrooms', 'Bathrooms', 'unemployment', 'pop', 'gini',
                     'median_income', 'poverty_rate', 'pop75', 'Travel_Time701902',
                     'Travel_Time267800', 'building_age']

final_plot_data = create_yearly_table(plot_data, selected_columns, [2014, 2015, 2016,2017])
In [25]:
#@title Default title text
step = 60
overlap = 1

alt.Chart(final_plot_data, height=step, width=600).transform_joinaggregate(
         median_value="median(TotalValue)", groupby=["year"]
    ).transform_bin(
         ['bin_max', 'bin_min'], 'TotalValue'
    ).transform_aggregate(
        value='count()', groupby=["year", "median_value", "bin_min", "bin_max"]
    ).transform_impute(
        impute="value", groupby=["year", "median_value"], key="bin_min", value=0
    ).mark_area(
        interpolate='monotone',
        fillOpacity=0.8,
        stroke='lightgray',
        strokeWidth=0.5
    ).encode(
        alt.X('bin_min:Q', bin="binned", title='Property Values between 2014 to 2018'),
        alt.Y(
            'value:Q',
            scale=alt.Scale(range=[step, -step * overlap]),
            axis=None
        ),
        alt.Fill(
            'median_value:Q',
            legend=alt.Legend(title=["Median Property", "Value of", "Communities"])
        )
    ).facet(
        row=alt.Row(
            'year:N',
            title=None,
            header=alt.Header(labelAngle=0, labelAlign='right')
    )
    ).properties(
        title= {"text": 'The Distribution of Community-Level Median Property Value Between 2014 - 2018',
                "subtitle": ["the median property values vary toward the right, which means that the increse is driven by the",
                "middle and high valued properties and the low-valued properties remain"]},
        bounds='flush'
    ).configure_facet(
        spacing=0
    ).configure_view(
        stroke=None
    ).configure_title(
        anchor='end'
    )
Out[25]:
  • Data Source: Los Angeles County Parcels Data, Uber Movement Data, American Community Survey

Reference

Included Encoding: Color, Text, Slope, Position, Area, Length, Shape

Included Scale: Nominal, Oridinal, Interval, Ratio

Source